
# ______ LIBRARIES ______
import glob
import itertools
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import ujson as json
from matplotlib import font_manager
from matplotlib.ticker import (MultipleLocator, AutoMinorLocator)
from scipy import stats



# ______ IMPORT 1 ______
path_root = '/maps/securegroupdir/tzp831/'
path_data = path_root + 'SAMF-STAT-HOPE/03 Data/nordic-twitter-dataset/nordic-tweets-2020-09-17/'

# Every entry of the folder is read as a tab-separated file and stacked into one frame.
df1 = pd.concat(
    [pd.read_csv(os.path.join(path_data, filename), sep='\t') for filename in os.listdir(path_data)],
    ignore_index=True,
)

# Columns that are not used further down.
df1 = df1.drop(columns=['filter_level',
                        'withheld_copyright',
                        'withheld_scope',
                        'truncated',
                        'retweet_count',
                        'favorite_count',
                        'from_user_utcoffset',
                        'from_user_timezone',
                        'from_user_lang',
                        'from_user_withheld_scope'])

# Keep rows whose language code contains "da". na=False counts a missing
# language as "no match"; without it a NaN in the mask makes the boolean
# selection raise. .copy() gives an independent frame so the column assignment
# below does not write into a slice of the unfiltered data.
df1 = df1[df1['lang'].str.contains("da", na=False)].copy()
df1['text'] = df1['text'].str.lower()
# df1.to_csv('dataset1.csv', index=None)



# ______ IMPORT 2 ______
path_root = '/maps/securegroupdir/tzp831/'
path_data = path_root + 'SAMF-STAT-HOPE/03 Data/nordic-twitter-dataset/nordic-tweets2-2020-09-17/'

# Every entry of the folder is read as a tab-separated file and stacked into one frame.
df2 = pd.concat(
    [pd.read_csv(os.path.join(path_data, filename), sep='\t') for filename in os.listdir(path_data)],
    ignore_index=True,
)

# Columns that are not used further down.
df2 = df2.drop(columns=['filter_level',
                        'withheld_copyright',
                        'withheld_scope',
                        'truncated',
                        'retweet_count',
                        'favorite_count',
                        'from_user_utcoffset',
                        'from_user_timezone',
                        'from_user_lang',
                        'from_user_withheld_scope'])

# Keep rows whose language code contains "da". na=False counts a missing
# language as "no match"; without it a NaN in the mask makes the boolean
# selection raise. .copy() gives an independent frame so the column assignment
# below does not write into a slice of the unfiltered data.
df2 = df2[df2['lang'].str.contains("da", na=False)].copy()
df2['text'] = df2['text'].str.lower()
# df2.to_csv('dataset2.csv', index=None)


# ______ IMPORT 3 ______

path_root = '/maps/securegroupdir/tzp831/'
path_data = path_root + 'SAMF-STAT-HOPE/03 Data/nordic-twitter-dataset/twitter-da-historic/'

# Every line of every file in the folder is parsed as one JSON record.
df3 = []
for filename in os.listdir(path_data):
    # `with` closes each file as soon as its lines have been parsed.
    with open(path_data + filename, encoding="utf-8") as tweet_file:
        df3.append(pd.DataFrame.from_records(map(json.loads, tweet_file)))
df3 = pd.concat(df3, ignore_index=True)

# Drop columns dropped in the live-scraped tweets df.
# Operates on df3 (the combined `df` does not exist yet at this point) and
# 'favorite_count' / 'contributors' are two separate entries.
df3 = df3.drop(columns=['filter_level',
                        'truncated',
                        'retweet_count',
                        'favorite_count',
                        'contributors',
                        'coordinates',
                        'display_text_range',
                        'entities',
                        'extended_entities',
                        'extended_tweet',
                        'favorited',
                        'id_str',
                        'in_reply_to_user_id',
                        'in_reply_to_user_id_str',
                        'is_quote_status',
                        'quote_count',
                        'quoted_status',
                        'quoted_status_id_str',
                        'quoted_status_permalink',
                        'reply_count',
                        'retweeted',
                        'retweeted_status',
                        'scopes'])


# Expand the USER information column into several columns
df3 = pd.concat([df3, df3['user'].apply(pd.Series)], axis=1)

# Some duplicate column names. Add _1 to the duplicates to differentiate
cols = pd.Series(df3.columns)
for dup in cols[cols.duplicated()].unique():
    cols[cols[cols == dup].index.values.tolist()] = [dup + '_' + str(i) if i != 0 else dup for i in range(sum(cols == dup))]
df3.columns = cols

# drop unnecessary columns
df3 = df3.drop(columns=['user',
                        'in_reply_to_status_id_str',
                        'id_str',
                        'translator_type',
                        'derived',
                        'protected',
                        'utc_offset',
                        'time_zone',
                        'geo_enabled',
                        'lang_1',
                        'contributors_enabled',
                        'is_translator',
                        'profile_background_color',
                        'profile_background_image_url',
                        'profile_background_image_url_https',
                        'profile_background_tile',
                        'profile_link_color',
                        'profile_sidebar_border_color',
                        'profile_sidebar_fill_color',
                        'profile_text_color',
                        'profile_use_background_image',
                        'profile_image_url_https',
                        'profile_banner_url',
                        'default_profile',
                        'default_profile_image',
                        'following',
                        'follow_request_sent',
                        'notifications'])

# Expand the 'geo' column into its own columns, then split the resulting
# 'coordinates' entries into 'lat' and 'lng'.
df3 = pd.concat([df3, df3['geo'].apply(pd.Series)], axis=1)
coords = df3['coordinates'].apply(pd.Series)
coords.columns = ['lat', 'lng']
df3 = pd.concat([df3, coords], axis=1)
df3 = df3.drop(columns=['geo', 'type', 'coordinates', 'matching_rules'])
df3['created_at'] = pd.to_datetime(df3['created_at'])

# NOTE(review): the `from_user_*` / `to_user_name` names selected below are not
# created by any rename in this section — confirm the step that maps the
# expanded user columns to these names is not missing.
df3 = df3[['id', 'created_at', 'from_user_name', 'text',
           'possibly_sensitive', 'lang', 'to_user_name', 'in_reply_to_status_id',
           'quoted_status_id', 'source', 'location', 'lat', 'lng', 'from_user_id',
           'from_user_realname', 'from_user_verified', 'from_user_description',
           'from_user_url', 'from_user_profile_image_url', 'from_user_tweetcount',
           'from_user_followercount', 'from_user_friendcount',
           'from_user_favourites_count', 'from_user_listed',
           'from_user_created_at']]

# na=False: a missing language counts as "no match" instead of breaking the mask;
# .copy() so the assignment below works on an independent frame.
df3 = df3[df3['lang'].str.contains("da", na=False)].copy()
df3['text'] = df3['text'].str.lower()
#df3.to_csv('dataset3.csv', index=None)


# ______ COMBINING DATASET 1+2+3 ______
# Tag every row with the collection it came from, then stack the three frames.
df1['dataset'] = 1
df2['dataset'] = 2
df3['dataset'] = 3
df = pd.concat([df1, df2, df3])


# ______ COVID-19 MASK ______
list_covid = ['covid',
              'corona',
              'epidemi',
              'virus',
              'smitte',
              'pandemi',
              'karantæne',
              'krise',
              'udbrud',
              'lockdown',
              'lukket ned',
              'nedlukning',
              'afstand',
              'bliv hjemme']
pattern_covid = '|'.join(list_covid)
# Keep tweets whose text matches any of the terms; rows without text never match.
df = df[df['text'].str.contains(pattern_covid, na=False)].copy()

# Parse the timestamps BEFORE sorting: df1/df2 were read from CSV without date
# parsing while df3 was already converted, so sorting the raw column would
# compare values of different types.
df['created_at'] = pd.to_datetime(df['created_at'])
df['from_user_created_at'] = pd.to_datetime(df['from_user_created_at'])
df = df.sort_values(by='created_at', axis=0)

#df.to_csv("big_COVID_dataset.csv",index=None)







# ______ KEYWORDS ______
df_keywords = pd.read_csv("keywords3.csv")

# Lower-case each term column and split it into a list on its delimiter set.
split_patterns = {
    'Broad Mask': r"[+|;]",
    'Keywords': r'[+|;|:|,]',
    'Exclude': r'[+|;|:|,]',
}
for column, delimiters in split_patterns.items():
    df_keywords[column] = df_keywords[column].str.lower().str.split(delimiters)
df_keywords = df_keywords.set_index('#')

# Per-story lookups keyed by the '#' column; rows without a value are left out.
narrow, broad, exclude, story = (
    df_keywords[column].dropna().to_dict()
    for column in ('Keywords', 'Broad Mask', 'Exclude', 'Titel')
)

# Theme number -> story numbers belonging to that theme.
themes = {
    1: [78, 99, 84, 9, 76, 80, 73],
    2: [88, 89, 90, 91, 92, 93, 94, 95, 96, 86, 85, 77, 79, 81, 74, 72, 68, 67, 63, 55, 49, 69, 54],
    3: [82, 71, 62, 59, 57, 58, 97, 62, 45, 46, 43, 36, 34, 25, 26, 22, 16, 18, 14, 10, 7, 52, 70],
    4: [47, 41, 39, 23, 11, 8, 3, 1, 23],
    5: [56, 50, 32, 24, 27, 33],
    6: [29, 2, 21, 60, 53, 51, 64, 44, 38, 35, 28, 20, 100, 19, 15, 12, 4, 6, 5, 13],
    7: [81, 75],
    8: [65, 83, 101],
}

# Theme number -> human-readable label.
themes_meaning = {
    1: 'contamination risks',
    2: 'fake cures',
    3: 'corona origins',
    4: 'facemasks',
    5: 'vaccine',
    6: 'government fears',
    7: 'home tests',
    8: 'other',
}
drop = [102, 30, 31, 37, 42, 40] # mundbind-related stories we have decided to remove


# ______ THEMES ______
corona_tweets = pd.read_csv('big_COVID_dataset.csv')

# One independent empty list per tweet; it collects the keys of the stories the
# tweet belongs to. The loops below edit these lists in place and rely on row
# selections handing back the very same list objects as the parent frame.
corona_tweets['story'] = [[] for _ in range(len(corona_tweets))]

# Broad pass: tag a tweet with a story when its text matches any broad term.
for key, terms in broad.items():
    hits = corona_tweets['text'].str.contains('|'.join(terms), na=False)
    for stories in corona_tweets.loc[hits, 'story']:
        stories.append(key)
broad_tweets = corona_tweets[corona_tweets['story'].map(len) > 0] #Remove all non-marked stories

# Narrow pass: a story tag is kept only if the text also matches one of the
# story's narrow keywords.
for k, terms in narrow.items():
    tagged = broad_tweets['story'].map(lambda stories: k in stories).astype(bool)
    unmatched = ~broad_tweets['text'].str.contains('|'.join(terms), na=False)
    for stories in broad_tweets.loc[tagged & unmatched, 'story']:
        stories.remove(k)

# Exclusion pass: a story tag is removed when the text matches an exclusion term.
for k, terms in exclude.items():
    tagged = broad_tweets['story'].map(lambda stories: k in stories).astype(bool)
    excluded = broad_tweets['text'].str.contains('|'.join(terms), na=False)
    for stories in broad_tweets.loc[tagged & excluded, 'story']:
        stories.remove(k)
narrow_tweets = broad_tweets[broad_tweets['story'].map(len) > 0] #Remove all non-marked stories
df = narrow_tweets

# Drop the stories we have decided to remove
for k in drop:
    for stories in df['story']:
        if k in stories:
            stories.remove(k)
# .copy() so the theme column is written to an independent frame, not a slice.
df = df[df['story'].map(len) > 0].copy() #Remove all non-marked stories

# A tweet gets theme t when it carries at least one story of that theme. Themes
# are applied in dictionary order, so a later theme overrides an earlier one.
df['theme'] = 0
for t, theme_stories in themes.items():
    wanted = set(theme_stories)
    in_theme = df['story'].map(lambda stories: not wanted.isdisjoint(stories)).astype(bool)
    df.loc[in_theme, 'theme'] = t





# ______ PREP ANNOTATION ______
# We do not annotate retweets or duplicates. So save a link to the tweet of the set that is annotated
## NOTE THIS TAKES A LONG TIME TO RUN. Therefore, I have saved the outcome in the file "final_dataset.csv"
# Work on an independent frame: `df` may be a filtered selection of a larger frame.
df = df.copy()
df['annotation_id'] = df['id']

# Body of every retweet: the text after the first ': ' of tweets starting with "rt @".
retweets = df.loc[df['text'].str.contains(r'^rt @', na=False), 'text']
og = retweets.str.split(': ', n=1, expand=True)[1].unique().tolist()
# The last three characters of each body are cut off (presumably a truncation
# marker — TODO confirm). Bodies that are missing (no ': ' in the retweet) or
# empty after the cut are skipped: an empty search string is contained in
# every tweet and would collapse the whole dataset onto one annotation id.
og = [tweet[:-3] for tweet in og if isinstance(tweet, str)]
og = [tweet for tweet in og if tweet]

for text in og:
    same_text = df['text'].str.contains(text, regex=False, na=False)
    # We keep the annotation ID of the FIRST tweet with this text
    if same_text.sum() > 1:
        df.loc[same_text, 'annotation_id'] = df.loc[same_text, 'id'].iloc[0]

# some duplicate tweets have the same text but different links, remove those too as we do not look at the links
# NOTE: this rebinds `df2`, which held the second raw dataset further up.
df2 = df.loc[df['annotation_id'].drop_duplicates().index]
splits = df2['text'].str.split('https', n=1, expand=True)
messages = splits.dropna()[0].unique()
for msg in messages:
    if msg == '':
        continue
    same_message = splits.index[splits[0] == msg]
    # All tweets sharing this pre-link text point at the first of them.
    df.loc[same_message, 'annotation_id'] = df.loc[same_message, 'id'].iloc[0]

for_annotation = df.loc[:, ['text', 'theme', 'story', 'created_at', 'annotation_id', 'in_reply_to_status_id']]
for_annotation = for_annotation.drop_duplicates(subset='annotation_id') # remove all directly duplicate tweets

# include link to conversation for tweets in response to something
links = for_annotation['in_reply_to_status_id'].dropna().map(lambda x: "https://twitter.com/anyuser/status/" + str(np.int64(x)))
for_annotation['context'] = links
for_annotation = for_annotation.drop('in_reply_to_status_id', axis=1)

# Create columns to fill in!
for_annotation['annotation?'] = ' '
for_annotation['humor?'] = ' '
for_annotation['difficult?'] = ' '
cols = ['annotation?', 'humor?', 'difficult?', 'text', 'theme', 'story', 'context', 'created_at', 'annotation_id' ]
for_annotation = for_annotation[cols]
for_annotation.to_csv("all_to_annotate.csv",encoding='utf-8', index=False)

# 'context' was only ever added to for_annotation, so it is not listed here:
# dropping a column that `df` does not have raises a KeyError.
df = df.drop(columns=['created_at', 'story'])
df = df.replace(r'http\S+', '#URL#', regex=True)
#df.to_csv("final_dataset.csv",encoding='utf-8', index=False)



# ______ CREATES BATCHES OF THEME ______
# Draw a random 110-tweet test sample for each of themes 1-6 and write one
# workbook per theme.
# NOTE(review): `themes_meaning` lists themes 1-8; range(1,7) leaves out 7 and 8 — confirm.
# NOTE(review): sample(n=110) presumably fails for a theme with fewer than 110 tweets — verify.
# NOTE(review): newer pandas releases may no longer accept `encoding` in to_excel — check the installed version.
for i in range(1,7):
    send = df[df["theme"] == i].drop(labels='theme', axis=1).sample(n=110)
    send.to_excel(f"annotationtest_theme{i}.xlsx", encoding='utf-8', index=False)

theme = 4 #face mask
# Full shuffle of the theme's tweets with the ids cast to str ...
sample = df[df["theme"] == theme].sample(frac=1).drop(labels='theme', axis=1)
sample['annotation_id'] = sample['annotation_id'].astype(str) #keep as string to prevent corruption upon save to excel document
# ... which is immediately replaced by a fresh 100-row sample, so the str cast
# above does not reach the data that is batched below.
# NOTE(review): the two `sample` assignments look like alternatives — confirm which is intended.
sample = df[df["theme"] == theme].sample(n=100).drop(labels='theme', axis=1)
# NOTE(review): empty output path looks like a placeholder — confirm the target file.
sample.to_excel("")

# Subset into batches of the same size
num = 7 #number annotators
set_size = int(sample.shape[0]/num)
# Batches 1 .. num-1 each hold set_size rows.
for i in range(1, num):
    # Save each batch into its own excel sheet
    sample.iloc[i*set_size-set_size: i*set_size].to_excel(f"02 Batches//theme{theme}_batch{i}.xlsx", encoding='utf-8', index=False)
# Save the remainder into a final excel sheet
# (`i` is the last value of the loop above, i.e. num-1)
sample.iloc[i*set_size:].to_excel(f"02 Batches//theme{theme}_batch{i+1}.xlsx", encoding='utf-8', index=False)

# Subset into batches of size 1000.
# NOTE(review): writes to the same file names as the equal-size batches above,
# so those files are overwritten. When `sample` has fewer than 1000 rows the
# loop body never runs and `i` still holds its value from the previous loop.
for i in range(1, 1+int((sample.shape[0] - sample.shape[0]%1000)/1000)):
    # Save each batch into its own excel sheet
    sample.iloc[i*1000-1000: i*1000].to_excel(f"02 Batches//theme{theme}_batch{i}.xlsx", encoding='utf-8', index=False)
# Save the remainder into a final excel sheet
sample.iloc[i*1000:].to_excel(f"02 Batches//theme{theme}_batch{i+1}.xlsx", encoding='utf-8', index=False)





# ______ INTERCODER RELIABILITY ______
# obtained from the internet (see the towardsdatascience article)

def fleiss_kappa(M):
    """Return Fleiss' kappa for a table of per-item category counts.

    :param M: array of shape (N, k) with N items and k categories;
        ``M[i, j]`` is the number of raters who put item ``i`` in category ``j``.
        The number of raters is read from the first row and used for every item.
    :type: numpy matrix
    :rtype: float
    :return: Fleiss' kappa, rounded to four decimals
    """
    n_items, _ = M.shape
    raters = float(M[0, :].sum())
    ratings_total = n_items * raters

    # Agreement expected by chance: squared share of each category, summed.
    category_share = M.sum(axis=0) / ratings_total
    expected = (category_share * category_share).sum()

    # Observed agreement: per-item share of agreeing rater pairs, averaged over items.
    per_item = ((M * M).sum(axis=1) - raters) / (raters * (raters - 1))
    observed = per_item.sum() / n_items

    kappa = (observed - expected) / (1 - expected)
    return round(kappa, 4)

def nominal_metric(a, b):
    """Distance for nominal data: whether the two values differ."""
    differs = a != b
    return differs

def interval_metric(a, b):
    """Distance for interval data: the squared difference of the two values."""
    difference = a - b
    return difference ** 2

def ratio_metric(a, b):
    """Distance for ratio data: the squared difference relative to the sum."""
    relative_difference = (a - b) / (a + b)
    return relative_difference ** 2

def krippendorff_alpha(data, metric=interval_metric, force_vecmath=False, convert_items=float, missing_items=None):
    '''
    Compute Krippendorff's alpha, a measure of inter-rater reliability.

    data is either a list with one dict per coder,
    [
        {unit1:value, unit2:value, ...},  # coder 1
        {unit1:value, unit3:value, ...},   # coder 2
        ...                            # more coders
    ]
    or a sequence of (masked) sequences (list, numpy.array, numpy.ma.array, e.g.)
    with one row per coder and one column per item.

    metric: function returning the distance between two values
    force_vecmath: use numpy vector math for a custom metric as well
    convert_items: conversion applied to every value (default: float)
    missing_items: values that mark a missing rating (default: None)

    Returns alpha as a float; 1.0 when there is no observed or no expected
    disagreement. Raises ValueError when no unit has at least two ratings.

    from https://github.com/grrrr/krippendorff-alpha/blob/master/krippendorff_alpha.py
    '''

    # number of coders (not used below)
    m = len(data)

    # values identifying a missing rating; the numpy masked constant always counts
    maskitems = [] if missing_items is None else list(missing_items)
    if np is not None:
        maskitems.append(np.ma.masked_singleton)

    # regroup the ratings: unit -> list of converted values from all coders
    units = {}
    for coder_ratings in data:
        try:
            # dict-like coder
            pairs = coder_ratings.items()
        except AttributeError:
            # otherwise a sequence indexed by position
            pairs = enumerate(coder_ratings)

        for unit_key, rating in pairs:
            if rating in maskitems:
                continue
            units.setdefault(unit_key, []).append(convert_items(rating))

    # only units rated at least twice can form pairs
    units = {unit_key: values for unit_key, values in units.items() if len(values) > 1}
    n = sum(len(values) for values in units.values())  # number of pairable values

    if n == 0:
        raise ValueError("No items to compare.")

    use_numpy = (np is not None) and ((metric in (interval_metric, nominal_metric, ratio_metric)) or force_vecmath)

    # observed disagreement: pairwise distances within each unit
    observed = 0.
    for grades in units.values():
        if use_numpy:
            grade_array = np.asarray(grades)
            within_unit = sum(np.sum(metric(grade_array, value)) for value in grade_array)
        else:
            within_unit = sum(metric(gi, gj) for gi in grades for gj in grades)
        observed += within_unit/float(len(grades)-1)
    observed /= float(n)

    if observed == 0:
        return 1.

    # expected disagreement: pairwise distances across all pairable values
    expected = 0.
    for g1 in units.values():
        if use_numpy:
            first = np.asarray(g1)
            for g2 in units.values():
                expected += sum(np.sum(metric(first, gj)) for gj in g2)
        else:
            for g2 in units.values():
                expected += sum(metric(gi, gj) for gi in g1 for gj in g2)
    expected /= float(n*(n-1))

    if observed and expected:
        return 1.-observed/expected
    return 1.


def _rating_counts(ratings, n_categories):
    """Turn a (raters, items) array of category codes into an (items, categories) count matrix.

    Entry [i, j] is the number of raters who gave item i the code j, which is
    the layout fleiss_kappa documents for its input. Ratings that equal none of
    the codes 0 .. n_categories-1 (e.g. NaN) are not counted.
    """
    ratings = np.asarray(ratings)
    return np.stack([(ratings == code).sum(axis=0) for code in range(n_categories)], axis=1)


# One frame per annotator workbook.
annotations = [pd.read_excel(file) for file in glob.glob('01 IAA Subsets/*')] #PUT PATH TO ANNOTATED DATA HERE

# Label text -> numeric code; labels missing from this table become NaN.
translation = {
    'Irrelevant': 0,
    'irrelevant': 0,
    'Afstandtagen fra misinformation': 1,
    'afstandtagen fra misinformation': 1,
    'Spreder misinformation': 2,
    'Spreder': 2,
    'Misinformation': 2
}
# One row of codes per annotator; the first three rows of each sheet are skipped.
tags = [list(annotation['Unnamed: 3'].map(translation).iloc[3:]) for annotation in annotations]
# fleiss_kappa expects per-item category counts, not the raw annotator rows.
print("Fleiss Kappa (sanity check)", fleiss_kappa(_rating_counts(tags, 3)))
# NOTE(review): the default metric is interval_metric although the codes are
# category labels — confirm that nominal_metric is not what is wanted here.
print("KRIPPENDORFF ALPHA (what we use):", krippendorff_alpha(np.array(tags)))


# Each annotator's own column is read (previously the first annotator's answers
# were reused for everyone, which makes all rows identical).
humor = [list(annotation['Unnamed: 4'].iloc[3:].map({'Ja':1,np.nan:0})) for annotation in annotations]
print(fleiss_kappa(_rating_counts(humor, 2)))
print(krippendorff_alpha(np.array(humor)))

hard = [list(annotation['Unnamed: 5'].iloc[3:].map({'Ja':1,np.nan:0})) for annotation in annotations]
print(fleiss_kappa(_rating_counts(hard, 2)))
print(krippendorff_alpha(np.array(hard)))


# Blank out the label wherever the annotator marked the tweet as difficult.
# The loop variable is not called `df`, so the main dataset is left untouched.
for annotation in annotations:
    annotation.loc[annotation['Unnamed: 5'] == 'Ja', 'Unnamed: 3'] = np.nan
n_tags = [list(annotation['Unnamed: 3'].map(translation).iloc[3:]) for annotation in annotations]
a = np.array(n_tags)
#drop all columns (aka answers) where SOMEONE answered that is was hard
fully_rated = a[:, ~np.isnan(a).any(axis=0)]
print(fleiss_kappa(_rating_counts(fully_rated, 3)))
print(krippendorff_alpha(fully_rated))





# ______ ADDING ANNOTATION TO DATASET ______
#theme4 is all annotated unique tweets from theme 4
theme4 = pd.read_csv("theme4.csv")
# One workbook per score value: 1 -> "combat", 2 -> "spread".
for score_value, workbook in ((1, "combat_theme4.xlsx"), (2, "spread_theme4.xlsx")):
    with_score = theme4[theme4['score'] == score_value]
    with_score.to_excel(workbook, index=False)

# Debug theme4
# Goal of this section: attach an annotation from `theme4` to every row of
# `theme4full` in several passes (exact text, annotation id, normalised text,
# first 50 characters). `good` collects matched rows, `missed_full` the rows of
# theme4full that are still unmatched.
# NOTE(review): `theme4full` is not defined in the part of the file shown here — confirm where it is created.
print("Total tweets to save:", theme4full.shape[0])
# Pass 1: rows whose text is identical in both frames.
good = theme4full.merge(theme4, on="text")
# Alias, not a copy: the in-place edits of all_annos below also change theme4.
all_annos = theme4
good.set_index('text', inplace=True)
# Outer merge keeps rows from either side; removing the texts found in `good`
# leaves the rows without a partner.
missed = theme4full.merge(theme4, on="text", how="outer")
missed.set_index('text', inplace=True)
missed.drop(good.index, inplace=True)
print("Caught by text overlap:", good.shape[0])
good.reset_index(inplace=True)
good.rename({"index":"text"}, axis=1, inplace=True)
missed.reset_index(inplace=True)
missed.rename({"index":"text"}, axis=1, inplace=True)
# id_y missing -> row without an annotation; id_x missing -> annotation without a row.
missed_full = missed[missed['id_y'].isna()]
missed_annos = missed[missed['id_x'].isna()]
print("Remainder:", missed_full.shape[0])

# add those labeled correctly!
# Pass 2: match the leftovers through annotation_id == id of the annotated tweet.
# NOTE(review): missed_full is a selection of `missed`; the in-place edits on it may trigger pandas' chained-assignment warning.
missed_full.dropna(axis=1, how='all', inplace=True)
new = missed_full.merge(all_annos, left_on="annotation_id", right_on="id").drop(columns='text_x')
new.rename({'text_y':'text'},axis = 1, inplace=True)
good = pd.concat((good, new))
missed.set_index('annotation_id', inplace=True)
new.set_index('annotation_id', inplace=True)
missed.drop(new.index, inplace=True)
good.reset_index(inplace=True)
good.rename({"index":"annotation_id"}, axis=1, inplace=True)
missed.reset_index(inplace=True)
missed.rename({"index":"annotation_id"}, axis=1, inplace=True)
print("Caught by text overlap:", good.shape[0])
missed_full = missed[missed['id_y'].isna()]
missed_annos = missed[missed['id_x'].isna()]
print("Remainder:", missed_full.shape[0])
good.drop(columns=['theme','theme_y'], inplace=True)
good.rename({"theme_x":"theme"}, axis=1, inplace=True)

# Pass 3: normalise the text on both sides before matching again.
# For retweets keep only the part after the first ': ' and cut its last three characters.
# NOTE(review): `missed_full['text'].loc[rts] = ...` is a chained assignment — confirm it updates missed_full on the pandas version in use.
rts = missed_full[missed_full['text'].str.contains(r'^rt @')].index
missed_full['text'].loc[rts] = missed_full.loc[rts]['text'].str.split(': ', n=1, expand=True)[1]
missed_full['text'].loc[rts] = missed_full.loc[rts]['text'].map(lambda x: x[:-3])
missed_full.replace(r'http\S+', '#URL#',regex=True, inplace=True)

# Replace links by '#URL#' and keep only the text before the first ' #URL#'.
all_annos.replace(r'http\S+', '#URL#',regex=True, inplace=True)
splits = missed_full.text.str.split(' #URL#', n=1, expand=True)
missed_full['text'] = splits[0]
splits = all_annos.text.str.split(' #URL#', n=1, expand=True)
all_annos['text'] = splits[0]

missed_full.dropna(axis=1, how='all', inplace=True)
new = missed_full.merge(all_annos, on="text")

# `.iloc[:,1]` on good['annotation_id'] only works when `good` carries two
# columns with that name; the second one is kept as the single annotation_id.
good_keep = good['annotation_id'].iloc[:,1]
good.drop(columns=['annotation_id'], inplace=True)
good['annotation_id'] = good_keep

good = pd.concat((good, new))
good.set_index('text', inplace=True)
new.set_index('text',inplace=True)
missed_full.set_index('text', inplace=True)
missed_full.drop(new.index, inplace=True)
good.reset_index(inplace=True)
good.rename({"index":"text"}, axis=1, inplace=True)
missed_full.reset_index(inplace=True)
missed_full.rename({"index":"text"}, axis=1, inplace=True)
print("Caught by text overlap:", good.shape[0])
print("Remainder:", missed_full.shape[0])


# Last ditch effort: Let's just look at the first 50 characters now...?
# Pass 4: both text columns are truncated to 50 characters before the merge.
missed_full['text'] = missed_full['text'].map(lambda x: x[:50])
all_annos['text'] = all_annos['text'].map(lambda x: x[:50])
missed_full.dropna(axis=1, how='all', inplace=True)
new = missed_full.merge(all_annos, on="text")
good = pd.concat((good, new))
good.set_index('text', inplace=True)
new.set_index('text',inplace=True)
missed_full.set_index('text', inplace=True)
missed_full.drop(new.index, inplace=True)
good.reset_index(inplace=True)
good.rename({"index":"text"}, axis=1, inplace=True)
missed_full.reset_index(inplace=True)
missed_full.rename({"index":"text"}, axis=1, inplace=True)
print("Caught by text overlap:", good.shape[0])
print("Remainder:", missed_full.shape[0])

# Defaults for the rows that no matching pass caught: score 0, no humor, not
# difficult, no annotator yet.
missed_full['score'] = 0
missed_full['humor?'] = 0
missed_full['difficult?'] = 0
missed_full['annotator'] = np.nan

# Hand-assigned annotators, keyed by the truncated tweet text. One text also
# gets score 2 and humor 1.
# NOTE(review): the last assignment of this list ends in a stray line-continuation backslash.
missed_full.loc[missed_full[missed_full['text'] == 'sundhedsstyrelsen anbefaler nu, at passagerer brug'].index,'annotator'] = 'IKWT'
missed_full.loc[missed_full[missed_full['text'] == 'hvis du gerne vil sikre dig, at du køber ce-mærked'].index,'annotator'] = 'IKWT'
missed_full.loc[missed_full[missed_full['text'] == 'vi har udvidet vores anbefalinger om brug af mundb'].index,'annotator'] = 'HK'
missed_full.loc[missed_full[missed_full['text'] == 'chefen kan ikke bestemme om man bærer mundbind i m'].index,'annotator'] = 'IKWT'
missed_full.loc[missed_full[missed_full['text'] == 'vigtigt at skelne mellem #mundbind (dråbesmitte) o'].index,'annotator'] = 'HK'
missed_full.loc[missed_full[missed_full['text'] == '@anmevase @regeringdk mundbind er tåbeligt🤦\u200d♂️  ti'].index,'annotator'] = 'MLJ'
missed_full.loc[missed_full[missed_full['text'] == '@anmevase @regeringdk mundbind er tåbeligt🤦\u200d♂️  ti'].index,'score'] = 2
missed_full.loc[missed_full[missed_full['text'] == '@anmevase @regeringdk mundbind er tåbeligt🤦\u200d♂️  ti'].index,'humor?'] = 1
missed_full.loc[missed_full[missed_full['text'] == 'hvilket mundbind skal du vælge? 😷 vi anbefaler ce-'].index,'annotator'] = 'MM'
missed_full.loc[missed_full[missed_full['text'] == 'men hvad vil de koste?  “samtidig har den kæmpe ef'].index,'annotator'] = 'IKWT'
missed_full.loc[missed_full[missed_full['text'] == 'mundbind på lager, til 199 kr. for 50 stk. ce mærk'].index,'annotator'] = 'IKWT'
missed_full.loc[missed_full[missed_full['text'] == '𝗠𝗮𝘀𝗸 𝗴𝘂𝗶𝗱𝗲 𝟭𝟬𝟭 - mens vi venter på en vaccine, så '].index,'annotator'] = 'HK'
missed_full.loc[missed_full[missed_full['text'] == 'klar til #dlfkongres med krav til #ok21 og valg af'].index,'annotator'] = 'IKWT'
missed_full.loc[missed_full[missed_full['text'] == 'jeg er imponeret over, hvor hurtigt vi i danmark h'].index,'annotator'] = 'FKL'
missed_full.loc[missed_full[missed_full['text'] == 'sosserne fra @spolitik har det ikke nemt med at ov'].index,'annotator'] = 'SK'
missed_full.loc[missed_full[missed_full['text'] == 'statsministeren fejlede i sin åbningstale ved ikke'].index,'annotator'] = 'MM'
missed_full.loc[missed_full[missed_full['text'] == 'fra i morgen bliver mundbind en meget synlig del a'].index,'annotator'] = 'FKL'
missed_full.loc[missed_full[missed_full['text'] == '"seks timer før de nye mundbindskrav træder i kraf'].index,'annotator'] = 'IKWT'
missed_full.loc[missed_full[missed_full['text'] == 'nogle af os med handicap får skæld ud. og vi hører'].index,'annotator'] = 'HK'
missed_full.loc[missed_full[missed_full['text'] == 'her forklarer social- og indenrigsminister @astrid'].index,'annotator'] = 'SK'
missed_full.loc[missed_full[missed_full['text'] == 'indtil videre er @videnskabdk - ikke overraskende '].index,'annotator'] = 'IKWT'
missed_full.loc[missed_full[missed_full['text'] == 'en voldsom morgen i rema i sunds, tak til \u2066@midtve'].index,'annotator'] = 'MM'\

# Rows that received an annotator above move over to `good`; the rest stay in missed_full.
caught = missed_full[~missed_full['annotator'].isna()]
missed_full = missed_full[missed_full['annotator'].isna()]
good = pd.concat((good, caught))
print("Caught by text overlap:", good.shape[0])
print("Remainder:", missed_full.shape[0])

# `test` is an alias of `good`, so the in-place de-duplication changes both names.
test = good
test.drop_duplicates('id_x', inplace=True)
realign = test.loc[:,['id','score','humor?','difficult?']]
#realign.id_x = realign.id.astype(np.int64)
# NOTE(review): realign was built with the columns 'id', 'score', 'humor?',
# 'difficult?', yet the merge key on its side is 'id_y' — confirm the intended key.
withrt = theme4full.merge(realign,left_on='id', right_on='id_y')




# ______ VISUALIZATION 1 ______
# Score code -> label used in the charts.
names = {
    0: 'Irrelevant',
    1: 'Combat misinformation',
    2: 'Spread misinformation'
}
bar = test['score'].map(names).value_counts()

n_posts = test.shape[0]
irrelevant_count = bar['Irrelevant']
spread_count = bar['Spread misinformation']
combat_count = bar['Combat misinformation']

# One stacked column at x=0: irrelevant at the bottom, then spreading, then combatting.
fig, ax = plt.subplots(figsize=(4,4))
ax.margins(x=1.0)
segments = (
    (irrelevant_count/n_posts, 0, 'darkgrey', "Irrelevant"),
    (spread_count/n_posts, irrelevant_count/n_posts, 'lightcoral', 'Spreading misinformation'),
    (combat_count/n_posts, (irrelevant_count+spread_count)/n_posts, 'lightgreen', 'Combatting misinformation'),
)
for share, base, shade, legend_label in segments:
    ax.bar(x=0, height=share, bottom=base, width=0.5, color=shade, label=legend_label)

ax.set_title("Proportion of Misinformation on Twitter \nin Facemask-related discussions")
y_axis = ax.yaxis
y_axis.set_major_locator(MultipleLocator(0.2))
y_axis.set_minor_locator(MultipleLocator(0.05))
y_axis.grid(which='major', alpha=0.3, linestyle='-')
y_axis.grid(which='minor', linestyle=':')
ax.get_xaxis().set_visible(False)
plt.show()



uniques = theme4['score'].map(names).value_counts()

# Two stacked columns showing the combat / spread split among relevant posts:
# x=0 uses the unique annotated tweets (theme4), x=1 the counts in `bar`.
fig, ax = plt.subplots(figsize=(6,4))
ax.margins(x=0.4)
for position, counts in enumerate((uniques, bar)):
    relevant = counts['Combat misinformation'] + counts['Spread misinformation']
    combat_share = counts['Combat misinformation'] / relevant
    spread_share = counts['Spread misinformation'] / relevant
    ax.bar(x=position, height=combat_share, color='lightgreen', width=0.5, label="Combatting misinformation")
    ax.bar(x=position, height=spread_share, bottom=combat_share, width=0.5, color='lightcoral', label='Spreading misinformation')
ax.yaxis.set_major_locator(MultipleLocator(0.2))
ax.set_title("Tweets spreading and combatting\n misinformation on Twitter")
ax.yaxis.set_minor_locator(MultipleLocator(0.05))
ax.yaxis.grid(which='major', alpha=0.3, linestyle='-')
ax.yaxis.grid(which='minor', linestyle=':')
# Put the ticks exactly under the two columns instead of overwriting entries 2
# and 4 of whatever tick list matplotlib happened to generate.
ax.set_xticks([0, 1])
ax.set_xticklabels(['Unique\nTweets', "Retweets \nand Reposts"])
plt.show()

# Share of combatting posts among the relevant ones, for both columns.
# Printed so the values are visible when the file runs as a script.
print(bar['Combat misinformation']/(bar['Combat misinformation']+bar['Spread misinformation']))
print(uniques['Combat misinformation']/(uniques['Combat misinformation']+uniques['Spread misinformation']))


# Score code -> label used in the charts.
names = {
    0: 'Irrelevant',
    1: 'Combat misinformation',
    2: 'Spread misinformation'
}
# Colour per label. value_counts orders the slices by frequency, so the
# colours are looked up by label instead of being passed in a fixed order.
slice_colors = {
    'Irrelevant': 'darkgrey',
    'Spread misinformation': 'lightcoral',
    'Combat misinformation': 'lightgreen',
}
score_counts = test['score'].map(names).value_counts()
score_counts.plot.pie(autopct='%1.1f%%', colors=[slice_colors[label] for label in score_counts.index], textprops={'color':"k"})
plt.title("Amount of Misinformation on Twitter \nfor Facemask-related discussions \n(Include reposts)")
plt.show()
# Printed so the shares are visible when the file runs as a script.
print(test['score'].map(names).value_counts(normalize=True))



# Share of posts without / with humor among combatting (score 1) and spreading
# (score 2) posts. value_counts orders by frequency and leaves out absent
# values, so reindex([0, 1]) pins the rows to "no humor, humor" before they
# are renamed (assumes 'humor?' is coded 0/1).
combat = test[test['score']==1]['humor?'].value_counts(normalize=True).reindex([0, 1], fill_value=0.0)
spread = test[test['score']==2]['humor?'].value_counts(normalize=True).reindex([0, 1], fill_value=0.0)
combat.index = ['No humor', 'Humor']
spread.index = ['No humor', 'Humor']
chart = pd.concat((combat,spread),axis=1)
chart.columns = ['Combatting misinformation', 'Spreading misinformation']
chart1 = chart.iloc[0]  # share without humor
chart2 = chart.iloc[1]  # share with humor
humor='yellow'
# Full-height columns, with the humor share drawn hatched on top of the no-humor share.
plt.bar(['Combatting misinformation', 'Spreading misinformation'], chart1 + chart2, color=['yellowgreen', 'lightcoral'], edgecolor='k')
plt.bar(['Combatting misinformation', 'Spreading misinformation'], chart2-0.005, width=0.79, bottom=chart1, hatch='//////', color=['yellowgreen', 'lightcoral'],edgecolor=humor)
# Zero-height bars that only provide the legend entry.
plt.bar([0,1], [0,0], color=humor, label="Humor")
plt.legend(loc="lower right")
plt.grid(alpha=0.3, axis='y')
plt.yticks([0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0], [0.0,'',0.2,'',0.4,'',0.6,'',0.8,'',1.0])
plt.title("Utilization of Humor in Misinformation-relevant posts")
plt.ylabel("Percentage of total posts")
plt.show()



# Share of posts marked difficult, split by whether the post uses humor.
# The data here is 'difficult?' grouped by 'humor?', so the series, axis
# labels, legend and title name that (they previously repeated the labels of
# the humor-by-stance chart). reindex([0, 1]) pins the row order, since
# value_counts orders by frequency (assumes 'difficult?' is coded 0/1).
with_humor = test[test['humor?']==1]['difficult?'].value_counts(normalize=True).reindex([0, 1], fill_value=0.0)
without_humor = test[test['humor?']==0]['difficult?'].value_counts(normalize=True).reindex([0, 1], fill_value=0.0)
with_humor.index = ['Not difficult', 'Difficult']
without_humor.index = ['Not difficult', 'Difficult']
chart = pd.concat((with_humor,without_humor),axis=1)
chart.columns = ['Humor', 'No humor']
chart1 = chart.iloc[0]  # share not marked difficult
chart2 = chart.iloc[1]  # share marked difficult
humor='yellow'
plt.bar(['Humor', 'No humor'], chart1 + chart2, color=['yellowgreen', 'lightcoral'], edgecolor='k')
# NOTE(review): unlike the humor chart, the hatched share starts at 0 (no `bottom`) — confirm this is intended.
plt.bar(['Humor', 'No humor'], chart2-0.005, width=0.79, hatch='//////', color=['yellowgreen', 'lightcoral'],edgecolor=humor)
# Zero-height bars that only provide the legend entry.
plt.bar([0,1], [0,0], color=humor, label="Difficult")
plt.legend(loc="upper right")
plt.grid(alpha=0.3, axis='y')
plt.yticks([0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0], [0.0,'',0.2,'',0.4,'',0.6,'',0.8,'',1.0])
plt.title("Annotation difficulty of posts with and without humor")
plt.ylabel("Percentage of total posts")
plt.show()



# ______ TTEST ______


# Rows of posts that combat (score 1) and spread (score 2) misinformation.
# They are rebuilt here because the plotting code above binds `combat` and
# `spread` to value_counts Series, which have no 'from_user_id' to group by.
# NOTE(review): assumes the two groups are meant to be row subsets of `test` — confirm.
combat = test[test['score'] == 1]
spread = test[test['score'] == 2]

# Number of posts per user, compared between the two groups.
print(stats.ttest_ind(combat.groupby('from_user_id')['score'].count(),spread.groupby('from_user_id')['score'].count()))

def cohend(d1, d2):
    """Return Cohen's d for two samples, based on the pooled standard deviation.

    Parameters
    ----------
    d1, d2 : sized sequences of numbers (lists, arrays, Series)
        The two samples to compare.

    Returns
    -------
    float
        ``(mean(d1) - mean(d2)) / pooled_sd``, where the pooled standard
        deviation weights each sample variance (``ddof=1``) by its degrees
        of freedom.

    NOTE(review): sample sizes come from ``len()``, which counts every entry;
    confirm the inputs contain no missing values.
    """
    size_a, size_b = len(d1), len(d2)
    var_a = np.var(d1, ddof=1)
    var_b = np.var(d2, ddof=1)
    # Sum of the variances weighted by degrees of freedom, then normalised by
    # the combined degrees of freedom.
    weighted_var_sum = (size_a - 1) * var_a + (size_b - 1) * var_b
    pooled_sd = np.sqrt(weighted_var_sum / (size_a + size_b - 2))
    mean_gap = np.mean(d1) - np.mean(d2)
    return mean_gap / pooled_sd
# Effect size for the per-user counts of non-null 'score' entries
# (result is not stored or printed).
_combat_rows_per_user = combat.groupby('from_user_id')['score'].count()
_spread_rows_per_user = spread.groupby('from_user_id')['score'].count()
cohend(_combat_rows_per_user, _spread_rows_per_user)


# Follower counts, one row per user: mean/std per group, Cohen's d, t-test.
value = 'from_user_followercount'
_combat_followers = combat.drop_duplicates('from_user_id')[value]
_spread_followers = spread.drop_duplicates('from_user_id')[value]
print(_combat_followers.mean(), _combat_followers.std())
print(_spread_followers.mean(), _spread_followers.std())
print(cohend(_combat_followers, _spread_followers))
stats.ttest_ind(_combat_followers, _spread_followers)

# Humor flag over all rows: mean/std per group, Cohen's d, degrees of freedom, t-test.
value = 'humor?'
_combat_humor = combat[value]
_spread_humor = spread[value]
print(_combat_humor.mean(), _combat_humor.std())
print(_spread_humor.mean(), _spread_humor.std())
print(cohend(_combat_humor, _spread_humor))
print(len(combat) + len(spread) - 2)
stats.ttest_ind(_combat_humor, _spread_humor)

# 'difficult?' compared between the combat and spread groups.
value = 'difficult?'
print(combat[value].mean(), combat[value].std())
print(spread[value].mean(), spread[value].std())
# NOTE(review): the descriptives and Cohen's d use the raw column, while the
# t-test below treats missing values as 0 — confirm which treatment is intended.
print(cohend(combat[value],spread[value]))
stats.ttest_ind(combat[value].fillna(0),spread[value].fillna(0))

# 'difficult?' compared between posts with humor and posts without humor.
value = 'difficult?'
_with_humor = test[test['humor?'] == 1][value]
_without_humor = test[test['humor?'] == 0][value]
print(_with_humor.mean(), _with_humor.std())
print(_without_humor.mean(), _without_humor.std())
# NOTE(review): same raw-versus-filled question as above applies to these lines.
print(cohend(_with_humor, _without_humor))
# Fill missing values with 0 in BOTH samples. Previously only the second sample
# was filled, so a single NaN in the first sample made ttest_ind return NaN
# (its default nan_policy propagates NaN) and the two samples were treated
# differently.
stats.ttest_ind(_with_humor.fillna(0), _without_humor.fillna(0))



# ______ VISUALIZATION 2 ______

def tukeys_method(df, variable):
    """Flag outliers in ``df[variable]`` using Tukey's fences.

    Parameters
    ----------
    df : DataFrame
        Data containing the column of interest.
    variable : str
        Name of the column to inspect.

    Returns
    -------
    tuple of (list, list)
        ``(probable, possible)``: 0-based row positions (not index labels) of
        values lying on or beyond the outer fences (3 * IQR from the quartiles)
        and the inner fences (1.5 * IQR) respectively. Every probable outlier
        is also listed among the possible ones.
    """
    column = df[variable]
    lower_quartile = column.quantile(0.25)
    upper_quartile = column.quantile(0.75)
    quartile_range = upper_quartile - lower_quartile

    def positions_beyond(multiplier):
        # Positions of values at or outside the fences placed `multiplier`
        # interquartile ranges away from the quartiles.
        margin = multiplier * quartile_range
        low = lower_quartile - margin
        high = upper_quartile + margin
        return [pos for pos, x in enumerate(column) if x <= low or x >= high]

    return positions_beyond(3), positions_beyond(1.5)

# Box plot of follower counts for both groups after removing Tukey outliers.
# tukeys_method reports row positions, so each frame is re-indexed 0..n-1
# before dropping to make positions and index labels coincide.
_probable, _possible = tukeys_method(combat, 'from_user_followercount')
to_drop = _probable + _possible
combat_clean = combat.reset_index().drop(to_drop, axis=0)

_probable, _possible = tukeys_method(spread, 'from_user_followercount')
to_drop = _probable + _possible
spread_clean = spread.reset_index().drop(to_drop, axis=0)

plt.title("Follower base of users participating in \nfacemask misinformation-related Twitter discourse")
plt.boxplot([
    combat_clean['from_user_followercount'],
    spread_clean['from_user_followercount'],
])
plt.ylabel("Number of followers")
plt.xticks(
    [1, 2],
    ["Users combatting\n misinformation", "Users spreading\n misinformation"],
)
plt.show()



# Daily number of tweets per 'score' class, drawn as one line per class.
#
# A unit-less "datetime64" dtype is rejected by pandas >= 2.0; spell out the
# nanosecond unit, which is what older versions used implicitly.
test.created_at = test.created_at.astype("datetime64[ns]")
# Count the occurrences of each score value per calendar day ...
interim = test.groupby(test['created_at'].dt.date)['score'].value_counts()
# ... and pivot the score values into columns (days without a class get 0).
interim2 = interim.unstack().fillna(0)
# NOTE(review): assumes exactly three score values whose sorted order matches
# these labels — confirm against the scoring scheme.
interim2.columns = ['General discussion', 'Combat misinformation', 'Spread misinformation']
interim2.plot(kind='line', color=['darkgrey','lightgreen','lightcoral'])
plt.title("Tweets mentioning facemasks overtime")
plt.grid(True,alpha=0.2)
plt.xlabel("Month")
plt.ylabel("Number")
plt.show()


# ______ FIGURES ______

# Register font files with matplotlib and make 'Calibri' the default font family.
# The results of the next two calls are discarded; they only have an effect
# when inspected interactively.
matplotlib.get_cachedir()
matplotlib.font_manager.findSystemFonts(fontpaths='Calibri', fontext='ttf')
# NOTE(review): 'Calibri' is passed as the search path here — presumably a
# directory with that name is expected to exist; confirm, otherwise no font
# files may be found.
font_dirs = 'Calibri'
font_files = font_manager.findSystemFonts(fontpaths=font_dirs)
# Add every font file that was found to matplotlib's font manager.
for font_file in font_files:
    font_manager.fontManager.addfont(font_file)
# Use Calibri for all text in the figures created afterwards.
plt.rcParams['font.family'] = 'Calibri'


# Timeline figure: cumulative share of the first two interim3 columns (left
# axis, in percent of all tweets so far) over the cumulative total number of
# tweets (right axis, grey area). Saved as "Timeline_pct_3.png".
# interim3, COMBAT, SPREAD, colors and dt are defined elsewhere in the file.
fig = plt.figure(figsize=(15,5), frameon=False)
# NOTE(review): tight_layout() runs before any axes exist — confirm it has the
# intended effect at this point.
plt.tight_layout()
ax = fig.add_subplot(111)
# Cumulative count of each interim3 column divided by the cumulative total of
# interim2 rows, expressed as a percentage.
ax.plot(interim3.iloc[:,0].cumsum()/interim2.sum(axis=1).cumsum()*100, linewidth=4, color=COMBAT, label='Distancing from Misinformation')
ax.plot(interim3.iloc[:,1].cumsum()/interim2.sum(axis=1).cumsum()*100, linewidth=4, color=SPREAD, label='Spreading Misinformation')
# Second y-axis sharing the same x-axis, used for the cumulative total.
ax2 = ax.twinx()
ax2.fill_between(interim2.index, interim2.sum(axis=1).cumsum(), 0.0, color='grey', alpha=0.3, label = 'General Discussion')
ax.grid(alpha=0.2)

# X labels: keep the automatically chosen tick positions but relabel them with
# month names. NOTE(review): this requires exactly 11 tick positions — confirm.
locs, labels = plt.xticks()
plt.xticks(locs, labels=['Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])

# Legend for the two percentage lines; each entry's text takes the i-th colour
# from `colors`.
l = ax.legend(loc="upper left", frameon=False,labelspacing=0.75)
for i,text in enumerate(l.get_texts()):
    text.set_color(colors[i])
    
# Hide all four border lines (spines) on both axes.
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['bottom'].set_visible(False)
ax.spines['left'].set_visible(False)
ax2.spines['top'].set_visible(False)
ax2.spines['right'].set_visible(False)
ax2.spines['bottom'].set_visible(False)
ax2.spines['left'].set_visible(False)

# Second legend via the pyplot interface — presumably attached to ax2 (the most
# recently created axes), i.e. the 'General Discussion' area; verify.
plt.legend(loc="lower right", frameon=False, bbox_to_anchor=(0.97, 0.0))

# Vertical grey lines marking three dates of interest.
plt.axvline(dt.datetime(2020, 10, 23), color='grey')
plt.axvline(dt.datetime(2020, 8, 15) , color='grey')
plt.axvline(dt.datetime(2020, 9, 18), color='grey')

ax.set_ylabel("Percentage of \nFacemask-related Tweets (%)")
# Right-hand label rotated to read top-to-bottom, padded away from the ticks.
ax2.set_ylabel("Total number of \n Facemask-related tweets", rotation=270, labelpad=45)
# Optional fixed axis limits, currently disabled:
# ax2.set_ylim(0, 35)
# ax.set_ylim(-20,100)
# Write the figure to disk with a transparent background before showing it.
plt.savefig("Timeline_pct_3.png", transparent=True, bbox_inches='tight')
plt.show()





# Horizontal bar chart of the humor share among posts combating (score == 1)
# and spreading (score == 2) misinformation. Saved as "bar2.png".
#
# value_counts() returns rows ordered by frequency, so the two groups can come
# back in different orders. Sort by the 'humor?' value first so that row 0 is
# the lower value and row 1 the higher value in BOTH groups; otherwise the
# positional relabelling below could mark humorous posts as 'No humor' in one
# group and pd.concat would then align the wrong rows.
#
# NOTE(review): this rebinds `combat` / `spread` to Series, while other parts of
# the script call combat.groupby('from_user_id') and so expect DataFrames.
combat = test[test['score']==1]['humor?'].value_counts(normalize=True).sort_index()
spread = test[test['score']==2]['humor?'].value_counts(normalize=True).sort_index()
combat.index = ['No humor', 'Humor']
spread.index = ['No humor', 'Humor']
chart = pd.concat((combat,spread),axis=1)
chart.columns = ['Combating\nmisinformation', 'Spreading\nmisinformation']
chart1 = chart.iloc[0]  # 'No humor' share per group
chart2 = chart.iloc[1]  # 'Humor' share per group
humor='white'
plt.figure(figsize=(15,5))
# Full-length outline bars, then hatched bars (starting at 0) for the humor share.
plt.barh(['Combating\nmisinformation', 'Spreading\nmisinformation'], chart1 + chart2, color=[COMBAT, SPREAD], edgecolor='k')
plt.barh(['Combating\nmisinformation', 'Spreading\nmisinformation'], chart2-0.005, height=0.78, hatch='///////', color=[COMBAT, SPREAD],edgecolor=humor)
# Zero-length bars exist only to create the "Use of Humor" legend entry.
plt.barh([0,1], [0,0], color=humor, label="Use of Humor")
plt.legend(loc="upper right", bbox_to_anchor=(0.96, 0.98))
# NOTE(review): the grid is requested for the y axis, whose ticks are removed
# below — confirm whether axis='x' was intended for this horizontal chart.
plt.grid(alpha=0.3, axis='y')
# Ticks every 10 %; only every second tick carries a label.
plt.xticks([0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0], ['0%','','20%','','40%','','60%','','80%','','100%'])
plt.xlabel("Percentage of posts")
plt.tight_layout()
# Hide the category tick labels on the y axis.
plt.yticks([])
plt.savefig("bar2.png", transparent=True,bbox_inches='tight')
plt.show()



